setwd("~/Dropbox/clado-manuscript/Mikes_MS_Data/")
#source('http://bioconductor.org/biocLite.R')
#biocLite('phyloseq')
library(phyloseq)
library(ggplot2)
library(plyr)
library(dplyr)
library(Rmisc)
library(DESeq2)
library(doParallel)
# Load biom file.
biom <- import_biom("OTU_table.biom", "~/Dropbox/clado-manuscript/Nephele/PipelineResults_NMEPINZ20QK1/nephele_outputs/tree.tre", parseFunction=parse_taxonomy_greengenes)
sam.data <- read.csv(file="sample.data.csv", row.names=1, header=TRUE)
head(sam.data)
## TreatmentGroup Site Date Description
## C172N1 Early North 172 Sample of day 172 at site North 1
## C172N2 Early North 172 Sample of day 172 at site North 2
## C172N3 Early North 172 Sample of day 172 at site North 3
## C172P1 Early Point 172 Sample of day 172 at site Point 1
## C172P2 Early Point 172 Sample of day 172 at site Point 2
## C172P3 Early Point 172 Sample of day 172 at site Point 3
## SampleID.1
## C172N1 C172N1
## C172N2 C172N2
## C172N3 C172N3
## C172P1 C172P1
## C172P2 C172P2
## C172P3 C172P3
sam.data$Date <- as.factor(sam.data$Date)
sam.data$DateSite <- paste(sam.data$Date, sam.data$Site)
head(sam.data); str(sam.data)
## TreatmentGroup Site Date Description
## C172N1 Early North 172 Sample of day 172 at site North 1
## C172N2 Early North 172 Sample of day 172 at site North 2
## C172N3 Early North 172 Sample of day 172 at site North 3
## C172P1 Early Point 172 Sample of day 172 at site Point 1
## C172P2 Early Point 172 Sample of day 172 at site Point 2
## C172P3 Early Point 172 Sample of day 172 at site Point 3
## SampleID.1 DateSite
## C172N1 C172N1 172 North
## C172N2 C172N2 172 North
## C172N3 C172N3 172 North
## C172P1 C172P1 172 Point
## C172P2 C172P2 172 Point
## C172P3 C172P3 172 Point
## 'data.frame': 52 obs. of 6 variables:
## $ TreatmentGroup: Factor w/ 2 levels "Early","Late": 1 1 1 1 1 1 1 1 1 1 ...
## $ Site : Factor w/ 3 levels "North","Point",..: 1 1 1 2 2 2 3 3 3 1 ...
## $ Date : Factor w/ 6 levels "172","178","185",..: 1 1 1 1 1 1 1 1 1 2 ...
## $ Description : Factor w/ 52 levels "Sample of day 172 at site North 1",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ SampleID.1 : Factor w/ 52 levels "C172N1","C172N2",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ DateSite : chr "172 North" "172 North" "172 North" "172 Point" ...
sample_data(biom) <- sam.data
biom; sample_data(biom)
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 51928 taxa and 52 samples ]
## sample_data() Sample Data: [ 52 samples by 6 sample variables ]
## tax_table() Taxonomy Table: [ 51928 taxa by 8 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 51928 tips and 51914 internal nodes ]
## Sample Data: [52 samples by 6 sample variables]:
## TreatmentGroup Site Date Description
## C178N1 Early North 178 Sample of day 178 at site North 1
## C178P1 Early Point 178 Sample of day 178 at site Point 1
## C185P2 Early Point 185 Sample of day 185 at site Point 2
## C206N2 Late North 206 Sample of day 206 at site North 2
## C206P1 Late Point 206 Sample of day 206 at site Point 1
## C206P2 Late Point 206 Sample of day 206 at site Point 2
## C214P1 Late Point 214 Sample of day 214 at site Point 1
## C214P2 Late Point 214 Sample of day 214 at site Point 2
## C214P3 Late Point 214 Sample of day 214 at site Point 3
## C214S1 Late South 214 Sample of day 214 at site South 1
## C214S2 Late South 214 Sample of day 214 at site South 2
## C214S3 Late South 214 Sample of day 214 at site South 3
## C185P1 Early Point 185 Sample of day 185 at site Point 1
## C185P3 Early Point 185 Sample of day 185 at site Point 3
## C199P3 Late Point 199 Sample of day 199 at site Point 3
## C199S2 Late South 199 Sample of day 199 at site South 2
## C199S3 Late South 199 Sample of day 199 at site South 3
## C206N1 Late North 206 Sample of day 206 at site North 1
## C178P2 Early Point 178 Sample of day 178 at site Point 2
## C199N3 Late North 199 Sample of day 199 at site North 3
## C206S1 Late South 206 Sample of day 206 at site South 1
## C214N3 Late North 214 Sample of day 214 at site North 3
## C199N2 Late North 199 Sample of day 199 at site North 2
## C206N3 Late North 206 Sample of day 206 at site North 3
## C206S2 Late South 206 Sample of day 206 at site South 2
## C199N1 Late North 199 Sample of day 199 at site North 1
## C199P1 Late Point 199 Sample of day 199 at site Point 1
## C199S1 Late South 199 Sample of day 199 at site South 1
## C214N1 Late North 214 Sample of day 214 at site North 1
## C172P1 Early Point 172 Sample of day 172 at site Point 1
## C199P2 Late Point 199 Sample of day 199 at site Point 2
## C172N3 Early North 172 Sample of day 172 at site North 3
## C172S3 Early South 172 Sample of day 172 at site South 3
## C178S2 Early South 178 Sample of day 178 at site South 2
## C178P3 Early Point 178 Sample of day 178 at site Point 3
## C178S3 Early South 178 Sample of day 178 at site South 3
## C172N1 Early North 172 Sample of day 172 at site North 1
## C172S1 Early South 172 Sample of day 172 at site South 1
## C178N3 Early North 178 Sample of day 178 at site North 3
## C185N2 Early North 185 Sample of day 185 at site North 2
## C185N3 Early North 185 Sample of day 185 at site North 3
## C185S3 Early South 185 Sample of day 185 at site South 3
## C214N2 Late North 214 Sample of day 214 at site North 2
## C172P2 Early Point 172 Sample of day 172 at site Point 2
## C185S2 Early South 185 Sample of day 185 at site South 2
## C172P3 Early Point 172 Sample of day 172 at site Point 3
## C185N1 Early North 185 Sample of day 185 at site North 1
## C172N2 Early North 172 Sample of day 172 at site North 2
## C178S1 Early South 178 Sample of day 178 at site South 1
## C185S1 Early South 185 Sample of day 185 at site South 1
## C172S2 Early South 172 Sample of day 172 at site South 2
## C178N2 Early North 178 Sample of day 178 at site North 2
## SampleID.1 DateSite
## C178N1 C178N1 178 North
## C178P1 C178P1 178 Point
## C185P2 C185P2 185 Point
## C206N2 C206N2 206 North
## C206P1 C206P1 206 Point
## C206P2 C206P2 206 Point
## C214P1 C214P1 214 Point
## C214P2 C214P2 214 Point
## C214P3 C214P3 214 Point
## C214S1 C214S1 214 South
## C214S2 C214S2 214 South
## C214S3 C214S3 214 South
## C185P1 C185P1 185 Point
## C185P3 C185P3 185 Point
## C199P3 C199P3 199 Point
## C199S2 C199S2 199 South
## C199S3 C199S3 199 South
## C206N1 C206N1 206 North
## C178P2 C178P2 178 Point
## C199N3 C199N3 199 North
## C206S1 C206S1 206 South
## C214N3 C214N3 214 North
## C199N2 C199N2 199 North
## C206N3 C206N3 206 North
## C206S2 C206S2 206 South
## C199N1 C199N1 199 North
## C199P1 C199P1 199 Point
## C199S1 C199S1 199 South
## C214N1 C214N1 214 North
## C172P1 C172P1 172 Point
## C199P2 C199P2 199 Point
## C172N3 C172N3 172 North
## C172S3 C172S3 172 South
## C178S2 C178S2 178 South
## C178P3 C178P3 178 Point
## C178S3 C178S3 178 South
## C172N1 C172N1 172 North
## C172S1 C172S1 172 South
## C178N3 C178N3 178 North
## C185N2 C185N2 185 North
## C185N3 C185N3 185 North
## C185S3 C185S3 185 South
## C214N2 C214N2 214 North
## C172P2 C172P2 172 Point
## C185S2 C185S2 185 South
## C172P3 C172P3 172 Point
## C185N1 C185N1 185 North
## C172N2 C172N2 172 North
## C178S1 C178S1 178 South
## C185S1 C185S1 185 South
## C172S2 C172S2 172 South
## C178N2 C178N2 178 North
head(otu_table(biom))
## OTU Table: [6 taxa and 52 samples]
## taxa are rows
## C178N1 C178P1 C185P2 C206N2 C206P1 C206P2
## New.CleanUp.ReferenceOTU155901 0 0 0 0 0 1
## New.CleanUp.ReferenceOTU321320 0 0 0 0 0 0
## KC551585.1.1451 2 0 0 6 0 4
## JQ945994.1.1399 0 0 0 0 0 0
## EF653577.1.1339 0 0 0 0 0 0
## JQ814729.1.1495 137 8 1 73 8 6
## C214P1 C214P2 C214P3 C214S1 C214S2 C214S3
## New.CleanUp.ReferenceOTU155901 0 0 1 0 1 0
## New.CleanUp.ReferenceOTU321320 0 0 0 0 0 0
## KC551585.1.1451 9 4 11 2 1 0
## JQ945994.1.1399 1 0 0 1 0 0
## EF653577.1.1339 0 0 0 4 0 0
## JQ814729.1.1495 11 4 27 40 8 23
## C185P1 C185P3 C199P3 C199S2 C199S3 C206N1
## New.CleanUp.ReferenceOTU155901 0 0 0 0 0 0
## New.CleanUp.ReferenceOTU321320 0 1 0 0 0 0
## KC551585.1.1451 3 2 5 0 0 11
## JQ945994.1.1399 0 0 0 0 0 0
## EF653577.1.1339 0 0 0 0 0 0
## JQ814729.1.1495 6 4 2 11 2 56
## C178P2 C199N3 C206S1 C214N3 C199N2 C206N3
## New.CleanUp.ReferenceOTU155901 0 0 0 0 0 0
## New.CleanUp.ReferenceOTU321320 0 0 0 0 0 0
## KC551585.1.1451 0 1 4 3 0 3
## JQ945994.1.1399 0 0 2 0 0 0
## EF653577.1.1339 0 0 0 0 0 0
## JQ814729.1.1495 10 23 6 19 5 849
## C206S2 C199N1 C199P1 C199S1 C214N1 C172P1
## New.CleanUp.ReferenceOTU155901 0 0 0 0 0 0
## New.CleanUp.ReferenceOTU321320 0 0 0 0 0 0
## KC551585.1.1451 1 0 8 2 5 0
## JQ945994.1.1399 0 0 0 0 0 0
## EF653577.1.1339 0 0 0 0 0 0
## JQ814729.1.1495 14 24 5 5 3 2
## C199P2 C172N3 C172S3 C178S2 C178P3 C178S3
## New.CleanUp.ReferenceOTU155901 0 0 0 0 0 0
## New.CleanUp.ReferenceOTU321320 0 0 0 0 0 0
## KC551585.1.1451 2 5 8 0 1 0
## JQ945994.1.1399 0 0 0 0 0 0
## EF653577.1.1339 0 0 0 0 0 0
## JQ814729.1.1495 2 121 31 0 0 5
## C172N1 C172S1 C178N3 C185N2 C185N3 C185S3
## New.CleanUp.ReferenceOTU155901 0 0 0 0 0 0
## New.CleanUp.ReferenceOTU321320 0 0 0 0 0 0
## KC551585.1.1451 0 0 1 0 0 0
## JQ945994.1.1399 0 0 0 0 0 0
## EF653577.1.1339 0 0 0 0 0 0
## JQ814729.1.1495 5 16 4 6 27 4
## C214N2 C172P2 C185S2 C172P3 C185N1 C172N2
## New.CleanUp.ReferenceOTU155901 0 0 0 0 0 0
## New.CleanUp.ReferenceOTU321320 0 0 1 0 0 0
## KC551585.1.1451 3 2 0 2 1 6
## JQ945994.1.1399 0 0 0 0 0 0
## EF653577.1.1339 0 0 0 0 0 0
## JQ814729.1.1495 4 13 1 3 22 21
## C178S1 C185S1 C172S2 C178N2
## New.CleanUp.ReferenceOTU155901 0 0 0 0
## New.CleanUp.ReferenceOTU321320 0 0 0 0
## KC551585.1.1451 0 1 0 3
## JQ945994.1.1399 0 0 3 0
## EF653577.1.1339 0 0 0 0
## JQ814729.1.1495 2 3 21 18
sample.summary <- data.frame(sample_data(biom)) %>%
group_by(Site,Date) %>%
summarize(total=n())
sample.summary
## Source: local data frame [18 x 3]
## Groups: Site [?]
##
## Site Date total
## <fctr> <fctr> <int>
## 1 North 172 3
## 2 North 178 3
## 3 North 185 3
## 4 North 199 3
## 5 North 206 3
## 6 North 214 3
## 7 Point 172 3
## 8 Point 178 3
## 9 Point 185 3
## 10 Point 199 3
## 11 Point 206 2
## 12 Point 214 3
## 13 South 172 3
## 14 South 178 3
## 15 South 185 3
## 16 South 199 3
## 17 South 206 2
## 18 South 214 3
# Takes the sample data we just connected to the phyloseq file, in the form of a dataframe,
# Groups it by each combination of sample site and date
# reports the total number of samples we have for each site/date combo - we have 2-3 samples for each.
variables <- expand.grid(Site=c("North","South"),Date=c("172","178","185","199","206","214"))
variables
## Site Date
## 1 North 172
## 2 South 172
## 3 North 178
## 4 South 178
## 5 North 185
## 6 South 185
## 7 North 199
## 8 South 199
## 9 North 206
## 10 South 206
## 11 North 214
## 12 South 214
# Creating a matrix with the different combinations of variables for Site and date
# We are creating a function called Dif_Abund that will run deseq on each subset of factors (day and Site),
# and return the factors, OTU ID, base mean, log2-fold change, the SE, and the p value associated with the response.
# This is testing for the question, on each date, are there differences between sample sites?
Dif_Abund = function(Site,Date){
biom.pruned = prune_samples(sample_data(biom)$Date == paste(Date),biom)
biom.pruned = prune_samples((sample_data(biom.pruned)$Site == paste(Site) | sample_data(biom.pruned)$Site == "Point"),biom.pruned)
# Pulling out the samples we want to compare - we will run DESeq for each date. So, we pull out the rows from the site we are interested in,
# and then also the baseline location (North) for that day, from our biom object.
biom.pruned = prune_taxa(taxa_sums(biom.pruned) > 0, biom.pruned)
# Cutting out any OTUs that don't show up in this comparison
taxonomy <- data.frame(tax_table(biom.pruned))
# Storing the taxonomy data
dseq = phyloseq_to_deseq2(biom.pruned, ~Site)
# Creating the dseq file, and using the formula that tells it we are interested in Site as a factor, and date as the baseline.
dseq$Site = relevel(dseq$Site,"Point")
# Establishing that the baseline comparison is for the first day.
dseq = DESeq(dseq, quiet = TRUE, fitType = "local")
# Running the actual DESeq function
results = results(dseq, cooksCutoff=TRUE)
# Reporting the results from the dseq object we just created
# You can change Cooks Cutoff to control outliers (TRUE) or not (FALSE)
results$Date = Date
results$Site = Site
# Creates a column in the results dataframe listing the Site
results = data.frame(results$Site,results$Date,rownames(results),results$baseMean,results$log2FoldChange,results$lfcSE,results$pvalue,taxonomy[,1:7])
# Pulling out all the results we are actually interested in
colnames(results) = c("Site","Date","OTU","baseMean","l2FC","SE","pvalue","Kingdom","Phylum","Class","Order","Family","Genus","Species")
# Naming the columns appropriately
results
# Return the final results dataframe
}
DA <- mdply(variables, Dif_Abund)
# Runs the differential abundance function we created above on all the combinations of variables
# We are leaving out the first variable, because that is the comparison baseline.
# The formula, as written above, will not tell us if the sites are different from one to the next
# It is looking for OTUs that are increased in relative abundance (compared to the first sampling day)
# across all sites, but accounting for the fact that the sites have different baselines, basically.
threshold = function (thresh){
filter(DA, baseMean >= thresh) %>%
mutate(padj = p.adjust(pvalue,"BH")) %>%
summarize(cutoff=thresh, count=sum(padj<=0.10, na.rm = TRUE))
}
# Creates a function called "threshold" that will take our DA table as created above, then filter it so that we only keep
# rows where the baseMean (roughly the relative abundance for that OTU) is greater than some threshold, "thresh"
# Then, for those remaining rows, we run a Benjamini-Hochberg correction on our p-values, outputting these adjusted
# values in a new colum ("padj")
# Finally, we report back the threshold, and then the sum of taxa for which we have adjusted p values
# less than or equal to 0.10, our chosen false discovery rate
range = seq(0,3,0.25)
# Creates a sequence of numbers we are interested in for adjusted p values (from 0-3 by 0.25 increments)
thresh.s <- ldply(range, threshold)
# Applys the Threshold function we created above to the range of numbers we created above.
plot(thresh.s$count~thresh.s$cutoff)

# We can plot the threshold for base Mean value against the number of samples that will pass under this cutoff.
# We can see the optimum value to use here (here, 1.5)
l2fc <- group_by(DA, Site, Date) %>%
filter(baseMean>=1.5) %>%
mutate(padj = p.adjust(pvalue,"BH"))
# We take that differential abundance table we created above ("DA"), and take the sum of the baseMean for each date
# Then we filter the whole set of values to include only those OTUs for which their baseMean was >1.5,
# We then adjust the p values only for those that we expect might be significant.
# I.e., those of very low relative abundance in the first place are not getting assessed at all,
# Allowing us to not test their significance - so untestested values will have NA in the padj column.
dim(l2fc[is.na(l2fc$padj)==TRUE,])[1]/dim(l2fc[])[1]
## [1] 0.007051813
# Fraction of OTUs that were not tested for significance (padj is "NA")
# We didn't actually cut it down that much, but it might help.
cutoff = 1
FDR = 0.1
d = l2fc %>%
group_by(Site,Date)%>%
mutate(Sig = ifelse(padj<FDR&l2FC>=cutoff,1,0))%>%
mutate(Sig = ifelse(is.na(padj)==TRUE,0,Sig))%>%
group_by(Site,Date)%>%
count(Site,Date,Sig)%>%
group_by(Site,Date)%>%
mutate(Fraction=n/sum(n))
d
## Source: local data frame [24 x 5]
## Groups: Site, Date [12]
##
## Site Date Sig n Fraction
## <fctr> <fctr> <dbl> <int> <dbl>
## 1 North 172 0 2355 0.94426624
## 2 North 172 1 139 0.05573376
## 3 North 178 0 1754 0.84570878
## 4 North 178 1 320 0.15429122
## 5 North 185 0 1746 0.84063553
## 6 North 185 1 331 0.15936447
## 7 North 199 0 1909 0.83289703
## 8 North 199 1 383 0.16710297
## 9 North 206 0 3034 0.95528967
## 10 North 206 1 142 0.04471033
## # ... with 14 more rows
# Creates a little table showing which fraction of OTUs were designated as significant or not on each date
# A good number are - 10-30%
FDR = 0.1
d = l2fc
d = group_by(d, Site,Date) %>%
mutate(sig = ifelse(padj<FDR,1,0))%>%
mutate(Total=sum(baseMean)) %>%
mutate(relabund=baseMean/Total)%>%
filter(padj != 'NA')
# Creates a column designating whether (1) or not (0) the padj is lower than our false discovery rate (set above)
# We also calculate a new relative abundance value using the average across samples
# and we cut out those rows with no p-values
PhylumFraction = 0.005
OTUFraction = 0.001
PhylumList = d %>%
group_by(Phylum,Date)%>%
filter(sum(relabund)>=PhylumFraction | relabund>=OTUFraction)
PhylumList = levels(droplevels(PhylumList$Phylum))
# Looks at each phylum, on each date, and sees if the total abundance of OTUs within that phylum is
# greater than the cutoff (PhylumFraction, defined above), OR if the single OTU makes up a large enough fraciton
# of the total community on its own (OTUFraction, defined above)
# Thus, we have selected the phyla we want to plot.
d = filter(d, Phylum %in% PhylumList)
max.l2FC = ddply(d, .(Phylum), summarize, M = max(l2FC))
d$Phylum = factor(d$Phylum, max.l2FC[order(-max.l2FC$M),]$Phylum)
# makes a dataframe with the maximum value of log2Fold change for each phylum
# takes our phylum column, and arranges it in order of our log2FoldChange values
d$sig = as.factor(d$sig)
d.yes = d[d$sig==1,]
d.no = d[d$sig==0,]
# Creates new columns (true/false) for whether it is significant or not
p = ggplot(d)
p = p + geom_point(data=d.yes, aes(x = Phylum, y = l2FC, fill = Phylum, size=relabund), shape = 21, alpha=0.4, position = position_jitter(w = 0.20))
# p = p + geom_point(data=d.no, aes(x = Date, y = l2FC, fill = sig, size=relabund), shape = 21, alpha=0.3, position = position_jitter(w = 0.20))
# Plotting the significant and the nonsignificant points differently
# size is proportional to relative abundance, and we colour it by Class
p = p + scale_size_continuous("log(Relative\nAbundance)",trans="log",guide="none")
# Transforming the size of the points so it's better visualized
p = p + facet_grid(Date~Site, scales="free_x")
# Saying we want it to present the data separately for each phylum
p = p + geom_hline(yintercept = 1, linetype=2)
p = p + geom_hline(yintercept = -1, linetype=2)
p = p + geom_hline(yintercept = 3.3219, linetype=3)
p = p + geom_hline(yintercept = -3.3219, linetype=3)
p = p + geom_hline(yintercept = 0.0, linetype=1)
# Puts in horizontal lines at reference values
p = p + theme_bw()
# Sets a theme
p = p + theme(strip.text.x = element_text(size = 16),
strip.text.y = element_text(size = 16),
axis.text.x = element_text(size = 14, angle = 45, hjust = 1, vjust = 1, face="italic"),
axis.title.x = element_text(size = 18),
axis.text.y = element_text(size=16),
axis.title.y = element_text(size = 18),
legend.title = element_text(size=18),
legend.text = element_text(size = 14),
#legend.position = "none",
strip.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
# sets a bunch of visual paramters for the legend (none) and other text
p = p + labs(y = expression(paste("", log[2]," fold change vs. Point site",sep="")))
# sets the label for the y axes.
p

dProteo = d %>%
filter(Phylum=="Proteobacteria")
d.yesProteo = d.yes %>%
filter(Phylum=="Proteobacteria")
d.noProteo = d.no %>%
filter(Phylum=="Proteobacteria")
# Subsetting our data for Proteos
p = ggplot(dProteo)
p = p + geom_point(data=d.yesProteo, aes(x = Order, y = l2FC, fill = Order, size=relabund), shape = 21, alpha=0.4, position = position_jitter(w = 0.20))
# p = p + geom_point(data=d.noProteo, aes(x = Date, y = l2FC, fill = sig, size=relabund), shape = 21, alpha=0.3, position = position_jitter(w = 0.20))
# Plotting the significant and the nonsignificant points differently
# size is proportional to relative abundance, and we colour it by Class
p = p + scale_size_continuous("log(Relative\nAbundance)",trans="log",guide="none")
# Transforming the size of the points so it's better visualized
p = p + facet_grid(Date~Site~Class, scales="free_x")
# Saying we want it to present the data separately for each phylum
p = p + geom_hline(yintercept = 1, linetype=2)
p = p + geom_hline(yintercept = -1, linetype=2)
p = p + geom_hline(yintercept = 3.3219, linetype=3)
p = p + geom_hline(yintercept = -3.3219, linetype=3)
p = p + geom_hline(yintercept = 0.0, linetype=1)
# Puts in horizontal lines at reference values
p = p + theme_bw()
p = p + theme(strip.text.x = element_text(size = 16),
strip.text.y = element_text(size = 16),
axis.text.x = element_text(size = 14, angle = 45, hjust = 1, vjust = 1, face="italic"),
axis.title.x = element_text(size = 18),
axis.text.y = element_text(size=16),
axis.title.y = element_text(size = 18),
legend.title = element_text(size=18),
legend.text = element_text(size = 14),
#legend.position = "none",
strip.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
# sets a bunch of visual paramters for the legend (none) and other text
p = p + labs(y = expression(paste("", log[2]," fold change vs. Point site",sep="")))
# sets the label for the y axes.
p + theme(legend.position="none")

dBacter = d %>%
filter(Phylum=="Bacteroidetes")
d.yesBacter = d.yes %>%
filter(Phylum=="Bacteroidetes")
d.noBacter = d.no %>%
filter(Phylum=="Bacteroidetes")
p = ggplot(dBacter)
p = p + geom_point(data=d.yesBacter, aes(x = Class, y = l2FC, fill = Class, size=relabund), shape = 21, alpha=0.4, position = position_jitter(w = 0.20))
#p = p + geom_point(data=d.noBacter, aes(x = Class, y = l2FC, fill = Class, size=relabund), shape = 21, alpha=0.3, position = position_jitter(w = 0.20))
# Plotting the significant and the nonsignificant points differently
# size is proportional to relative abundance, and we colour it by Class
p = p + scale_size_continuous("log(Relative\nAbundance)",trans="log",guide="none")
# Transforming the size of the points so it's better visualized
p = p + facet_grid(Date~Site, scales="free_x")
# Saying we want it to present the data separately for each phylum
p = p + geom_hline(yintercept = 1, linetype=2)
p = p + geom_hline(yintercept = -1, linetype=2)
p = p + geom_hline(yintercept = 3.3219, linetype=3)
p = p + geom_hline(yintercept = -3.3219, linetype=3)
p = p + geom_hline(yintercept = 0.0, linetype=1)
# Puts in horizontal lines at reference values
p = p + theme_bw()
# Sets a theme
p = p + theme(strip.text.x = element_text(size = 16),
strip.text.y = element_text(size = 16),
axis.text.x = element_text(size = 14, angle = 45, hjust = 1, vjust = 1, face="italic"),
axis.title.x = element_text(size = 18),
axis.text.y = element_text(size=16),
axis.title.y = element_text(size = 18),
legend.title = element_text(size=18),
legend.text = element_text(size = 14),
#legend.position = "none",
strip.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
# sets a bunch of visual paramters for the legend (none) and other text
p = p + labs(y = expression(paste("", log[2]," fold change vs. Point site",sep="")))
# sets the label for the y axes.
p + theme(legend.position="none")

# Find methanotrophs
methanolist <- read.table(file = "methanos.txt")
methanolist <- as.vector(methanolist$V1)
# Create a list of all the possible methanotroph genera
df = l2fc %>%
filter(Genus %in% as.factor(methanolist))
# Take out only the methanotrophs
df = group_by(df, Site,Date) %>%
mutate(sig = ifelse(padj<FDR,1,0))%>%
mutate(Total=sum(baseMean)) %>%
mutate(relabund=baseMean/Total)%>%
filter(padj != 'NA')
# Creates a column designating whether (1) or not (0) the padj is lower than our false discovery rate (set above)
# We also calculate a new relative abundance value using the average across samples
# and we cut out those rows with no p-values
df$sig = as.factor(df$sig)
df.yes = df[df$sig==1,]
df.no = df[df$sig==0,]
# Creates new columns (true/false) for whether it is significant or not
p = ggplot(df)
p = p + geom_point(data=df.yes, aes(x = Date, y = l2FC, fill = Site, size=relabund), shape = 21, alpha=0.4, position = position_jitter(w = 0.20))
p = p + geom_point(data=df.no, aes(x = Date, y = l2FC, fill = sig, size=relabund), shape = 21, alpha=0.3, position = position_jitter(w = 0.20))
# Plotting the significant and the nonsignificant points differently
# size is proportional to relative abundance, and we colour it by Class
p = p + scale_size_continuous("log(Relative\nAbundance)",trans="log",guide="none")
# Transforming the size of the points so it's better visualized
p = p + facet_grid(~Genus, scales="free_x")
# Saying we want it to present the data separately for each phylum
p = p + geom_hline(yintercept = 1, linetype=2)
p = p + geom_hline(yintercept = -1, linetype=2)
p = p + geom_hline(yintercept = 3.3219, linetype=3)
p = p + geom_hline(yintercept = -3.3219, linetype=3)
p = p + geom_hline(yintercept = 0.0, linetype=1)
# Puts in horizontal lines at reference values
p = p + theme_bw()
p = p + theme(strip.text.x = element_text(size = 16),
strip.text.y = element_text(size = 16),
axis.text.x = element_text(size = 14, angle = 45, hjust = 1, vjust = 1, face="italic"),
axis.title.x = element_text(size = 18),
axis.text.y = element_text(size=16),
axis.title.y = element_text(size = 18),
legend.title = element_text(size=18),
legend.text = element_text(size = 14),
#legend.position = "none",
strip.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
# sets a bunch of visual paramters for the legend (none) and other text
p = p + labs(y = expression(paste("", log[2]," fold change vs. Point site",sep="")))
# sets the label for the y axes
p

mdf = psmelt(biom)
#"melting" the phyloseq data into a dataframe
# We want to calculate the total relative abundance of each phylum
# and then take the top X most abundant phyla
SortedPhyla = mdf %>%
group_by(Sample,Phylum) %>%
summarize(PhyAbund = sum(Abundance))%>%
# Should give us the relative abundance of all OTUs in each phylum from each sample.
group_by(Phylum)%>%
summarize(MeanPhyAbund = mean(PhyAbund))%>%
arrange(-MeanPhyAbund)
# Then sums the mean relative abundance of each phylum, across samples, and reports it from most to least
nPhyla = 13
# How many phyla do we want to include (plus 1 for NAs)?
PhylumList = SortedPhyla[1:nPhyla,1]
PhylumList = PhylumList[is.na(PhylumList)==FALSE,]
PhylumList = levels(droplevels(as.factor(PhylumList$Phylum)))
# List of nPhyla top most abundant phyla
PhylumList
## [1] "[Thermi]" "Acidobacteria" "Actinobacteria"
## [4] "Armatimonadetes" "Bacteroidetes" "Cyanobacteria"
## [7] "Gemmatimonadetes" "GN02" "Planctomycetes"
## [10] "Proteobacteria" "SR1" "Verrucomicrobia"
biom2 = subset_taxa(biom, Phylum %in% PhylumList)
mdf2 = psmelt(biom2)
head(mdf2)
## OTU Sample Abundance TreatmentGroup Site Date
## 231063 HQ178949.1.1333 C206N3 51716 Late North 206
## 2001312 New.ReferenceOTU480 C172P3 46742 Early Point 172
## 2001291 New.ReferenceOTU480 C206N2 28094 Late North 206
## 2012320 New.ReferenceOTU806 C206N2 23022 Late North 206
## 2018804 New.ReferenceOTU992 C178S2 21465 Early South 178
## 2001273 New.ReferenceOTU480 C172P2 20245 Early Point 172
## Description SampleID.1 DateSite Kingdom
## 231063 Sample of day 206 at site North 3 C206N3 206 North Bacteria
## 2001312 Sample of day 172 at site Point 3 C172P3 172 Point Bacteria
## 2001291 Sample of day 206 at site North 2 C206N2 206 North Bacteria
## 2012320 Sample of day 206 at site North 2 C206N2 206 North Bacteria
## 2018804 Sample of day 178 at site South 2 C178S2 178 South Bacteria
## 2001273 Sample of day 172 at site Point 2 C172P2 172 Point Bacteria
## Phylum Class Order Family
## 231063 Proteobacteria Gammaproteobacteria Aeromonadales Aeromonadaceae
## 2001312 Cyanobacteria Chloroplast Stramenopiles <NA>
## 2001291 Cyanobacteria Chloroplast Stramenopiles <NA>
## 2012320 [Thermi] Deinococci Thermales Thermaceae
## 2018804 Proteobacteria Gammaproteobacteria Methylococcales Crenotrichaceae
## 2001273 Cyanobacteria Chloroplast Stramenopiles <NA>
## Genus Species
## 231063 <NA> <NA>
## 2001312 <NA> <NA>
## 2001291 <NA> <NA>
## 2012320 Meiothermus <NA>
## 2018804 Crenothrix <NA>
## 2001273 <NA> <NA>
mdf2 = mdf2 %>%
group_by(Sample,Site,Date,Phylum) %>%
summarize(PhyAbund = sum(Abundance))
p = ggplot(mdf2, aes(Date, PhyAbund, fill = Date))
p = p + geom_boxplot()
p = p + facet_wrap(~Phylum, scales = "free_y")
#p = p + scale_fill_manual(values=c("orange","gold1","red3"))
p = p + theme_bw()
p = p + guides(fill = "none")
p = p + theme(legend.position = "none")
p = p + theme(strip.text.x = element_text(size=11),
strip.text.y = element_text(size=9),
strip.background = element_rect(colour="white", fill="white"))
p = p + theme(axis.text.x = element_text(size = 9, angle = 45, hjust=1),axis.text.y = element_text(size = 9))
p = p + theme(axis.title.x = element_blank())
p = p + theme(axis.title.y = element_text(size = 15, vjust = 1))
p = p + labs(x="Amendment",y="Relative abundance")
p
